/*
	Purpose: Using the sample of 1940 Census fathers aged 30-50
	         (from 0b), this file creates father income scores
	         at the occupation x race x region level. Missing
	         income is imputed.

	Creates: avgincomes_fathers1940_byrace_byregion.dta
*/

* Session setup: no paging of output, empty memory, project data folder
set more off
clear
cd "$Mydirectory1/1_DataSources/CensusData/"

* Load the fathers sample that the income scores are built from
use "./input/Census1940_fathers_ages30to50_forIncomeScores.dta", clear

	* Keep everything except occupation code 99
	* NOTE(review): 99 is presumably an unclassified/non-occupation code -- confirm
	keep if occ1950ej != 99

*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

	* Unit counter; summed in the collapse further down to give each cell's size
	generate number = 1
	
*******************
*** TEMPLATES
*******************

* Template 1: occupation x race
* Builds one row for race==1 and one row for race==2 per occupation code,
* whether or not that combination is observed in the sample.
preserve

	* One row per occupation; collapsing race keeps the variable (and its
	* storage type) in the template
	collapse (min) race, by(occ1950ej)

	* Duplicate each occupation and number the two copies 1 and 2 explicitly.
	* Setting only the second copy to 2 would leave the first copy at
	* min(race); for an occupation with no race==1 observations that gives
	* two race==2 rows, and the template would no longer be unique on
	* occupation x race for the 1:1 merge that uses it.
	expand 2
	bysort occ1950ej: replace race = _n

	tempfile occbyrace
	save `occbyrace'

restore
	
* Template 2: race x region
* Builds one row for race==1 and one row for race==2 per region,
* whether or not that combination is observed in the sample.
preserve

	* One row per region; collapsing race keeps the variable in the template
	collapse (min) race, by(region)

	* Duplicate each region and number the two copies 1 and 2 explicitly,
	* rather than relying on min(race) being 1 for the first copy (a region
	* with no race==1 observations would otherwise get two race==2 rows).
	expand 2
	bysort region: replace race = _n

	tempfile region
	save `region'

restore

* Template 3: occupation x race x region
* Pairs every occupation-by-race row with every region row of the same race.
preserve

	tempfile template

	use "`occbyrace'", clear
	joinby race using "`region'"

	save "`template'"

restore


*******************
*** COLLAPSE 
*******************

	* Cell size (sum of the unit counter) and mean incomes for every
	* occupation x race x region cell observed in the sample
	collapse (rawsum) number (mean) incwage fam_income hh_income, ///
		by(occ1950ej race region)

	tempfile income
	save "`income'"

* Attach the observed cells to the full template
	use "`template'", clear
	merge 1:1 occ1950ej race region using "`income'", nogenerate

	* Template cells with no observations have no count after the merge
	replace number = 0 if missing(number)

* How many cells lack a wage income score, and how many have a score of zero
	count if missing(incwage)
	count if incwage == 0

	tabulate occ1950ej race if missing(incwage)

	* 1 for cells with no wage income score at this point, 0 otherwise
	generate flag_1940_byocc_byr_byreg = missing(incwage)
	
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

********************************************
*** IMPUTATIONS
********************************************

* Black engineers in the West
* The race==2, region==4, occ1950ej==7 cell is filled in, for each income
* measure, as the race==1 value of the same occupation and region divided by
* the region-4 average White/Black income ratio across occupations.

	foreach x in  incwage fam_income hh_income {
		sort occ1950ej region race

		* White/Black ratio within an occupation x region pair. It is defined
		* only on a race==2 row that directly follows the pair's race==1 row,
		* so any additional row in a group (e.g. another race code carried in
		* by the merge) cannot contribute a ratio to the regional average.
		by occ1950ej region: gen white_black_ratio = `x'[_n-1] / `x'[_n] ///
			if race==2 & race[_n-1]==1

		tab region if `x'==.
		sum white_black_ratio if region==4 [aw=number] //average racial income gap in the west across occupations
		local ratio1 = r(mean)

		* r(mean) below is the race==1 value for occupation 7 in region 4
		sum `x' if region==4 & race==1 & occ1950ej==7

		* Scale white engineer income in the West by the average racial income
		* gap in the West. Restricted to the cell flagged as lacking a wage
		* income score, so an observed value is never overwritten and the
		* imputation flag stays in step with the values that were filled in.
		replace `x' = `r(mean)' / `ratio1' ///
			if region==4 & race==2 & occ1950ej==7 & flag_1940_byocc_byr_byreg==1

		drop white_black_ratio
	}

* Every cell must have a wage income score after the imputation
	assert incwage!=.
	
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

**********
* Save
**********

	* Final variable names used in the output file
	rename occ1950ej  fatheroccej
	rename number     number_1940obs_byrace_byregion
	rename incwage    avgincwage_1940_byrace_byregion
	rename fam_income avg_faminc_1940_byrace_byregion
	rename hh_income  avg_HHinc_1940_byrace_byregion

	* Variable labels, attached to the final names
	label variable avgincwage_1940_byrace_byregion "Coarse income score, incwage, by race and region"
	label variable avg_HHinc_1940_byrace_byregion "Coarse income score, household income, by race and region"
	label variable avg_faminc_1940_byrace_byregion "Coarse income score, family income, by race and region"
	label variable flag_1940_byocc_byr_byreg "Flag for imputed value"
	label variable race "Respondent race"
	label variable number_1940obs_byrace_byregion "Number of obs in 1940 occ x race x region cell"

	* Write the occupation x race x region income scores
	save "./output/avgincomes_fathers1940_byrace_byregion.dta", replace
